# First cell: collect, clean, and partially display the *weekly* "Search Trends" dataset.
import pandas as pd

# Show the full frame when printing. Note: max_colwidth=-1 was deprecated and
# later removed from pandas; None is the supported spelling for "no limit".
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

url = 'symptoms_weekly.csv'
# index_col=0 / parse_dates=[0]: first CSV column becomes a parsed-date index
# (assumes the file's first column is a date — TODO confirm against the file).
df1 = pd.read_csv(url, index_col=0, parse_dates=[0])

# Drop columns where every value is null.
empty_cols = [col for col in df1.columns if df1[col].isnull().all()]
df1.drop(empty_cols, axis=1, inplace=True)

# Keep only rows that carry a state-level region code.
df1 = df1[df1['sub_region_1_code'].notna()]
# Mirror the region code under the name used for merging with the
# hospitalization data, and pull out the two-letter state code
# (codes look like 'US-NY', so characters 3+ give 'NY').
df1['open_covid_region_code'] = df1['sub_region_1_code']
df1['state_codes'] = df1['open_covid_region_code'].str[3:]
# 'place_id' carries no information we use downstream, so drop it.
redundant = ['place_id']
df1 = df1.drop(columns=redundant)
# Drop rows with fewer than 390 non-NaN values (i.e. mostly-empty weeks).
df1 = df1.dropna(thresh=390)
df1['date'] = pd.to_datetime(df1['date'])
# Multi-index of (open_covid_region_code, date) to line up with the hospitalization data.
df1 = df1.set_index(['open_covid_region_code', 'date'])
print(df1.head())
print(df1.shape)
# Second cell: collect, clean, and partially display the "COVID Hospitalization Cases" dataset.
import numpy as np

# max_colwidth=-1 was removed from pandas; None means "no limit".
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

url = 'https://raw.githubusercontent.com/google-research/open-covid-19-data/master/data/exports/cc_by/aggregated_cc_by.csv'
# First CSV column (open_covid_region_code) becomes the index.
df2 = pd.read_csv(url, index_col=0, parse_dates=[0])

# Treat zero counts as missing so the dropna below removes them.
df2 = df2.replace({0.0: np.nan})
# Keep only the features we need.
relevant_col = ['region_name', 'date', 'hospitalized_new']
df2 = df2[relevant_col]
# Drop rows with no hospitalized_new value.
df2 = df2.dropna(subset=['hospitalized_new'])

# Convert dates and resample daily counts to weekly sums per region.
# The `loffset` keyword was deprecated in pandas 1.1 and removed in 2.0,
# so the one-day shift of the week label is applied explicitly afterwards.
df2['date'] = pd.to_datetime(df2['date'])
df2 = (df2.reset_index()           # promote open_covid_region_code from index to column
          .set_index('date')
          .groupby(['open_covid_region_code', 'region_name'])
          .resample('W', label='left')
          .sum(numeric_only=True)
          .fillna(0)
          .reset_index())
df2['date'] = df2['date'] + pd.DateOffset(days=1)
# Multi-index of (open_covid_region_code, date), matching df1.
df2 = df2.set_index(['open_covid_region_code', 'date'])
print(df2.head(10))
print(df2.shape)
# Third cell: merge the weekly symptoms and hospitalization datasets —
# one frame for visualization (left join) and one for machine learning
# (inner join). Both frames are shown partially.
# max_colwidth=-1 was removed from pandas; None means "no limit".
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# inner_df keeps only (region, week) points present in BOTH datasets.
inner_df = pd.merge(df1, df2, left_index=True, right_index=True)
# left_df keeps every row of df1, even when hospitalized_new is missing.
left_df = pd.merge(df1, df2, left_index=True, right_index=True, how='left')
print(left_df.head(5))
print(left_df.shape)
print(inner_df.head(5))
print(inner_df.shape)
# Persist the visualization frame for the next cell.
left_df.to_csv('out.csv')
# Part 2: Visualization
# We now visualize the symptoms dataset: compute the column means to keep only
# the most commonly searched symptoms, then select those most relevant to COVID-19.
# Visualization of the weekly dataset.
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

df = pd.read_csv('out.csv')
# Drop columns with fewer than 100 non-NaN values.
df = df.dropna(axis=1, thresh=100)
# Keep only symptoms (columns) with the highest mean search frequency.
# numeric_only=True is required: the frame still holds string columns
# (state codes, dates, region names), and pandas >= 2.0 raises a TypeError
# if mean() is asked to aggregate them.
means = df.mean(numeric_only=True)
most_pop = [col for col, m in means.items() if m > 6]
# Preserve the identifying columns before slicing down to symptom columns.
state_codes = df['state_codes']
date = df['date']
open_covid_region_code = df['open_covid_region_code']
# .copy() so the assignments below write to an independent frame
# instead of a view (avoids SettingWithCopyWarning).
df = df[most_pop].copy()
df['state_codes'] = state_codes
df['open_covid_region_code'] = open_covid_region_code
df['date'] = date
df['date_by_week'] = date
df = df.set_index(['open_covid_region_code', 'date'])
print(most_pop)
print(df.head(5))
def _symptom_choropleth(frame, column, title):
    """Build an animated USA choropleth of one symptom's weekly search frequency.

    frame  -- DataFrame holding the symptom columns plus 'state_codes' and
              'date_by_week' (one animation frame per week).
    column -- symptom column to color by (e.g. 'symptom:Cough').
    title  -- figure title.
    Returns the plotly Figure.
    """
    fig = px.choropleth(frame, locations=frame['state_codes'],
                        color=frame[column],
                        hover_name=frame[column],
                        locationmode='USA-states',
                        animation_frame=frame['date_by_week'],
                        title=title,
                        scope="usa",
                        color_continuous_scale=px.colors.sequential.PuRd)
    # Drop the play/pause buttons; the animation slider alone drives playback.
    fig["layout"].pop("updatemenus")
    return fig

# One map per high-frequency symptom. The fig..fig6 names are kept so any
# later cells that reference them still work.
fig = _symptom_choropleth(df, 'symptom:Cough', "'Cough' Google Search Frequency in USA")
fig2 = _symptom_choropleth(df, 'symptom:Common cold', "'Common Cold' Google Search Frequency in USA")
fig3 = _symptom_choropleth(df, 'symptom:Fever', "'Fever' Google Search Frequency in USA")
fig4 = _symptom_choropleth(df, 'symptom:Infection', "'Infection' Google Search Frequency in USA")
fig5 = _symptom_choropleth(df, 'symptom:Pain', "'Pain' Google Search Frequency in USA")
fig6 = _symptom_choropleth(df, 'symptom:Anxiety', "'Anxiety' Google Search Frequency in USA")
fig.show()
fig2.show()
fig3.show()
fig4.show()
fig5.show()
fig6.show()